NOTE: all the helper classes, functions and other objects created in this notebook could be stored in separate files for iteration/deployment purposes, but in order to follow the development logic and make it more convenient for the reader, the notebook contains all the detailed inputs/outputs and objects.
# Basics
import pandas as pd
import numpy as np
# Visualization
import plotly.graph_objs as go
import plotly.offline as pyo
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
# Models
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
# Feature importance
import shap
# Optimization and cv
from bayes_opt import BayesianOptimization # docs are here https://github.com/fmfn/BayesianOptimization
from sklearn.model_selection import KFold
# Metrics
from sklearn.metrics import mean_squared_error, r2_score
# Serialization and helpers
import joblib
from typing import Union
# Fix random_state for experiments reproducibility
RANDOM_STATE = 42
# Set notebook mode to work in offline
import warnings
pyo.init_notebook_mode()
warnings.filterwarnings('ignore')
# Check what we have in the data
df = pd.read_csv('2016-09-19_79351_training.csv') # raw transactions table
mcc_group = pd.read_csv('mcc_group_definition.csv') # mcc group id -> explanation lookup
transaction_types = pd.read_csv('transaction_types.csv') # type code -> explanation/direction/agent lookup
df.head()
| user_id | transaction_date | transaction_type | mcc_group | amount_n26_currency | dataset_transaction | dataset_user | |
|---|---|---|---|---|---|---|---|
| 0 | a78884f5e76951188c1e719d4956773a | 2016-02-01 | DT | NaN | 350 | training | training |
| 1 | b0333294fef6ff1299102a70ad46b126 | 2016-02-01 | DT | NaN | 202 | training | training |
| 2 | 7b8d2a2780adae0cd0c248e92c1b28dc | 2016-02-01 | DT | NaN | 291 | training | training |
| 3 | cc4abaa500f7db4390ae3f02bd36d805 | 2016-02-01 | DT | NaN | 214 | training | training |
| 4 | 49b99d1d5ba028566639e8b3eb7c055b | 2016-02-01 | DT | NaN | 272 | training | training |
mcc_group.head()
| mcc_group | explanation | |
|---|---|---|
| 0 | 1 | ATM |
| 1 | 2 | Groceries |
| 2 | 3 | Gastronomie |
| 3 | 4 | Shopping/(Fashion) |
| 4 | 5 | Home and DIY, House Energy |
transaction_types.head()
| type | explanation | direction | agent | |
|---|---|---|---|---|
| 0 | AR | Authorization reject | - | Card |
| 1 | AE | Authorization expired | In | Card |
| 2 | AV | Authorization Reversal | In | Card |
| 3 | BBU | Cash26 deposit | In | Partner |
| 4 | CT | Credit Transfer | In | Bank Account |
# Left join types to the initial df and map mcc groups
df = (df.merge(transaction_types, how='left', left_on='transaction_type', right_on='type').drop('type', axis=1)
      .merge(mcc_group, how='left', on='mcc_group'))
# Both lookup tables carry an 'explanation' column, hence pandas' automatic _x/_y suffixes
df = df.rename(columns = {'explanation_x': 'explanation_type',
                          'explanation_y': 'explanation_mcc'}) # rename explanations columns
# Inspect schema, dtypes and non-null counts
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 408546 entries, 0 to 408545 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_id 408546 non-null object 1 transaction_date 408546 non-null object 2 transaction_type 408546 non-null object 3 mcc_group 258324 non-null float64 4 amount_n26_currency 408546 non-null int64 5 dataset_transaction 408546 non-null object 6 dataset_user 408546 non-null object 7 explanation_type 408546 non-null object 8 direction 408546 non-null object 9 agent 408546 non-null object 10 explanation_mcc 258324 non-null object dtypes: float64(1), int64(1), object(9) memory usage: 37.4+ MB
# EDA plots functions
def plot_counts(df:pd.DataFrame, feature:str):
    """Draw a count plot of `feature`, categories ordered from most to least frequent."""
    category_order = df[feature].value_counts().index
    grid = sns.catplot(x=feature, data=df,
                       height=4, aspect=1.5, kind='count',
                       order=category_order)
    grid.set_xticklabels(rotation=90)  # rotate so long category names do not overlap
def plot_total_volumes(df:pd.DataFrame, feature:str, amount_column:str):
    """Draw a bar plot of the total `amount_column` per category of `feature`, largest first."""
    totals = df.groupby(feature)[amount_column].sum().reset_index()
    ordered_categories = totals.sort_values(by=amount_column, ascending=False)[feature]
    ax = sns.barplot(data=totals, x=feature, y=amount_column, width=0.8,
                     order=ordered_categories)
    ax.set_xticklabels(labels=ordered_categories, rotation=90)  # rotate to avoid overlapping labels
def plot_daily_volumes(df:pd.DataFrame, feature:str, amount_column:str, date_column:str):
    """Plot one line per category of `feature` showing the daily mean of `amount_column`."""
    daily = df.groupby([feature, date_column])[amount_column].mean().reset_index()
    fig = go.Figure()
    for category in list(daily[feature].unique()):
        subset = daily[daily[feature] == category]
        fig.add_trace(go.Scatter(x=subset[date_column].values,
                                 y=subset[amount_column].values,
                                 mode='lines', name=category))
    fig.update_layout(paper_bgcolor='rgba(0,0,0,0)',  # transparent (white) background
                      plot_bgcolor='rgba(0,0,0,0)',
                      title=f"Daily mean volumes by {feature}")
    # Hiding the mode bar saves rendering time in the notebook
    fig.show(config={'displayModeBar': False})
# Unique users
df['user_id'].nunique()
10000
# Unique users month to month (dates are still 'YYYY-MM-DD' strings at this point)
for month in list(df['transaction_date'].apply(lambda x: x[:7]).unique()):
    # str.contains on the 'YYYY-MM' prefix selects that month's rows
    print(f"{month}: {df.loc[df['transaction_date'].str.contains(month)]['user_id'].nunique()} users")
2016-02: 7188 users 2016-03: 7438 users 2016-04: 7402 users 2016-05: 7404 users 2016-06: 7398 users 2016-07: 7384 users
NOTE: The number of users is not the same over time, and the user sample may vary from month to month.
# Unique types
df['explanation_type'].unique()
array(['Debit Transfer', 'Credit Transfer', 'Debit Reversal',
'MoneyBeam Transfer', 'Direct Debit', 'Presentment',
'Cash26 deposit', 'Cash26 withdrawal', 'Transferwise Transaction'],
dtype=object)
# Explanation of the types when directions is out and in
print(f"Out transactions: {df[df['direction'] == 'Out'].explanation_type.unique()}")
print(f"In transactions: {df[df['direction'] == 'In'].explanation_type.unique()}")
Out transactions: ['Debit Transfer' 'MoneyBeam Transfer' 'Direct Debit' 'Presentment' 'Cash26 withdrawal' 'Transferwise Transaction'] In transactions: ['Credit Transfer' 'Debit Reversal' 'Cash26 deposit']
# Save the In/Out type explanations as lists for later filtering
out_transactions = list(df[df['direction'] == 'Out'].explanation_type.unique())
in_transactions = list(df[df['direction'] == 'In'].explanation_type.unique())
# Plot the types counts
plot_counts(df, 'explanation_type')
# Plot the types volumes
plot_total_volumes(df, 'explanation_type', 'amount_n26_currency')
# Plot daily average volumes for incoming transactions
plot_daily_volumes(df[df['explanation_type'].isin(in_transactions)],
                   'explanation_type', 'amount_n26_currency', 'transaction_date')
# Plot daily average volumes for outgoing transactions
plot_daily_volumes(df[df['explanation_type'].isin(out_transactions)],
                   'explanation_type', 'amount_n26_currency', 'transaction_date')
NOTES: The transaction types are fixed for the In and Out directions. Most transactions are of the "Presentment" type, which reflects card purchases. Credit Transfer shows peaks at the end of each month, which likely corresponds to salary days for employed customers. Debit Transfer shows a high daily average amount and the second-highest total value; it is usually used for apartment rent payments or big purchases and should be relatively stable over time.
# Amount is in abs values, no zeros
df[df['amount_n26_currency'] <= 0]
| user_id | transaction_date | transaction_type | mcc_group | amount_n26_currency | dataset_transaction | dataset_user | explanation_type | direction | agent | explanation_mcc |
|---|
# Check the amount distributions
sns.histplot(df[df['direction'] == 'In']['amount_n26_currency'])
<AxesSubplot: xlabel='amount_n26_currency', ylabel='Count'>
The transaction amounts look right-skewed (a long tail of large amounts). A logarithmic transformation may be applied in a future transformation step.
# Datasets transaction and user are the same, we can drop them
df[df['dataset_transaction'] != df['dataset_user']]
| user_id | transaction_date | transaction_type | mcc_group | amount_n26_currency | dataset_transaction | dataset_user | explanation_type | direction | agent | explanation_mcc |
|---|
# Check for which agents mcc is not null
df[~df['mcc_group'].isna()].agent.unique()
array(['Card'], dtype=object)
# Check for which transaction types mcc is not null
df[~df['mcc_group'].isna()].explanation_type.unique()
array(['Presentment'], dtype=object)
# Check if we have empty mcc group for Presentment
df[df['explanation_type'] == 'Presentment'].mcc_group.isna().sum()
0
# Plot the types counts
plot_counts(df, 'explanation_mcc')
# Plot the types volumes
plot_total_volumes(df, 'explanation_mcc', 'amount_n26_currency')
# Plot daily average volumes by card transactions
plot_daily_volumes(df, 'explanation_mcc', 'amount_n26_currency', 'transaction_date')
NOTES: the highest volume in terms of both number of transactions and total amount is for ATM; customers also often use cards in grocery stores and for travel/hotel bookings. The Presentment transaction type is fully covered by the mcc_group types: there are no empty mcc_group values for Presentment.
# Check unique types when agent is card
df[df['agent'] == 'Card'].explanation_type.unique()
array(['Presentment'], dtype=object)
# Check partner agents transactions types
df[df['agent'] == 'Partner'].explanation_type.value_counts()
Cash26 deposit 3561 Cash26 withdrawal 942 Transferwise Transaction 230 Name: explanation_type, dtype: int64
# Check if NaN in mcc group when agent is card
df[df['agent'] == 'Card'].mcc_group.isna().sum()
0
# Plot the agent counts
plot_counts(df, 'agent')
# Plot the agent volumes
plot_total_volumes(df, 'agent', 'amount_n26_currency')
# Plot daily average volumes by agent
plot_daily_volumes(df, 'agent', 'amount_n26_currency', 'transaction_date')
# Check if we have transactions without direction
df[~df['direction'].isin(['Out', 'In'])]
| user_id | transaction_date | transaction_type | mcc_group | amount_n26_currency | dataset_transaction | dataset_user | explanation_type | direction | agent | explanation_mcc |
|---|
# Plot the directions counts
plot_counts(df, 'direction')
# Plot the directions volumes
plot_total_volumes(df, 'direction', 'amount_n26_currency')
# Plot daily average volumes by direction
plot_daily_volumes(df, 'direction', 'amount_n26_currency', 'transaction_date')
NOTES: most of the transactions are outgoing, the total customers balance is below 0. At the same time average incoming transaction amount is higher which is also expected.
IDEA: all transaction types can be clearly separated into incoming and outgoing. Presentment-type transactions can be split into more detailed mcc groups, which can potentially give us better signals than the aggregated Presentment category. Those transaction (+mcc) types can be used as features. I propose to calculate the total spending/income per customer for each month and each type. Each month's total can be considered as spending/income with a lag relative to the target month. Additionally, let's split the problem into 2 datasets, since we need to predict both incomes and expenses. I also assume that the previous months' income can affect future spending and vice versa.
Let's use the data from Feb, 2016 until Jun, 2016 to create features and Jul, 2016 total incomes/expenses as targets.
# Drop useless columns: the two dataset flags are identical, the raw type/mcc codes are
# superseded by their explanations, and agent adds no information beyond the type
df.drop(['dataset_transaction', 'dataset_user', 'transaction_type', 'mcc_group', 'agent'], axis=1, inplace=True)
# Change date type and create month column
df['transaction_date'] = pd.to_datetime(df['transaction_date']) # transform to dt
df['transaction_month'] = df['transaction_date'].apply(lambda x: x.month)
# Add aggregated df to perform faster calculations
agg_df = df.copy()
# Replace the coarse 'Presentment' type with its finer-grained mcc explanation
agg_df.loc[agg_df['explanation_type'] == 'Presentment', 'explanation_type'] = (
    agg_df.loc[agg_df['explanation_type'] == 'Presentment', 'explanation_mcc'])
# Total amount per user, per calendar month, per (mcc-refined) type
agg_df = (agg_df.groupby(['user_id', 'transaction_month', 'explanation_type'])['amount_n26_currency']
          .sum().reset_index())
# Change months to "lag style" names: July (the target month) becomes 0, June -1, ..., February -5
agg_df['transaction_month'] = agg_df['transaction_month'].apply(lambda x: x-7)
# Create future column names as combination of Type + Lag month, e.g. 'ATM_-1'
agg_df['features'] = (agg_df['explanation_type'].apply(lambda x: x + '_') +
                      agg_df['transaction_month'].apply(lambda x: str(x)))
# Pivot to one row per user; (user, month, type) is unique after the groupby above,
# so aggfunc='first' simply picks that single value
user_df = agg_df.pivot_table(values='amount_n26_currency', index='user_id', columns='features', aggfunc='first')
user_df
| features | ATM_-1 | ATM_-2 | ATM_-3 | ATM_-4 | ATM_-5 | ATM_0 | Automotive/Gas_-1 | Automotive/Gas_-2 | Automotive/Gas_-3 | Automotive/Gas_-4 | ... | Transport_-3 | Transport_-4 | Transport_-5 | Transport_0 | Travel/Hotel_-1 | Travel/Hotel_-2 | Travel/Hotel_-3 | Travel/Hotel_-4 | Travel/Hotel_-5 | Travel/Hotel_0 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| user_id | |||||||||||||||||||||
| 000295594379774ab9ac2c78c946d615 | 52.0 | 147.0 | NaN | 380.0 | 280.0 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | 4.0 | NaN | NaN | NaN | NaN | NaN |
| 000aa792d73dd82b16a29692772d395a | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 0011103fe4ba1264342882b7ab98c641 | NaN | 25.0 | 48.0 | 27.0 | NaN | 416.0 | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | 220.0 | 72.0 | NaN | 146.0 | NaN | NaN | 355.0 |
| 001679c77c33d7efabf800596fb2a978 | 1025.0 | 706.0 | 309.0 | NaN | 220.0 | 926.0 | 17.0 | 15.0 | 37.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 001e72a2f4c4376f10011735dde39cd5 | NaN | 65.0 | NaN | 107.0 | NaN | NaN | NaN | NaN | NaN | 65.0 | ... | 55.0 | NaN | 12.0 | NaN | NaN | NaN | NaN | 38.0 | 39.0 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| fff571276eaa49853039f6bfe1102fba | NaN | NaN | 11.0 | 11.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| fff6ea4b3659a969c62c17c1d1655394 | NaN | NaN | NaN | 96.0 | 194.0 | NaN | NaN | 16.0 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 22.0 |
| fff9eec15875baad3df433c8780aa3bd | 85.0 | NaN | NaN | NaN | NaN | 7.0 | NaN | NaN | NaN | NaN | ... | 56.0 | 181.0 | NaN | NaN | 10.0 | 21.0 | 33.0 | 28.0 | NaN | NaN |
| fffd503ed2ecb63067b7369f4d668aed | 59.0 | NaN | 7.0 | 90.0 | 96.0 | 23.0 | 11.0 | 7.0 | 22.0 | NaN | ... | NaN | NaN | NaN | NaN | 23.0 | NaN | NaN | 32.0 | NaN | 6.0 |
| fffde1ba10b4040deb651162f56c9fc4 | 114.0 | 249.0 | 264.0 | 279.0 | 22.0 | 375.0 | NaN | NaN | NaN | NaN | ... | 4.0 | NaN | NaN | 44.0 | NaN | 14.0 | NaN | NaN | NaN | 116.0 |
10000 rows × 150 columns
# Remove July features (month 0) which will be used to calculate targets, in order to prevent data leakage
mask = [i for i in list(user_df.columns) if i[-1:] != '0']
user_df = user_df[mask]
# Save feature names (lag features only) for reuse in the preprocessing function below
features_names = list(user_df.columns)
# The idea is to fill in as many NaN values as possible per user per category.
# Each feature comes with 5 monthly lags. We could split them in groups of 5 and
# replace missing months with the user's median over the other months (if at least one is available):
# months = 5
# for i in range(0, len(features_names), months):
#     user_df.loc[:, features_names[i:i+months]] = (user_df[features_names[i:i+months]].T
#         .fillna(user_df[features_names[i:i+months]].median(axis=1)).T) # transp, fill, transp back
# Instead, leave the rest of the NaN values and let xgboost handle them natively
# user_df = user_df.fillna(0)
# Add y_true: July (calendar month 7) totals per user, split by transaction direction
incomes_july = (df[(df['transaction_month'] == 7) & (df['explanation_type'].isin(in_transactions))]
                .groupby('user_id')['amount_n26_currency'].sum()).reset_index()
spending_july = (df[(df['transaction_month'] == 7) & (df['explanation_type'].isin(out_transactions))]
                 .groupby('user_id')['amount_n26_currency'].sum()).reset_index()
# 'user_id' is the index of user_df here; merge matches it against the right-hand column
user_df = user_df.merge(incomes_july, how='left', on='user_id').merge(spending_july, how='left', on='user_id')
user_df = user_df.rename(columns={'amount_n26_currency_x': 'y_true_income',
                                  'amount_n26_currency_y': 'y_true_expenses'})
# Add 5-month (Feb-Jun) means as the baseline prediction
incomes_means = (df[(df['transaction_month'] != 7) & (df['explanation_type'].isin(in_transactions))]
                 .groupby('user_id')['amount_n26_currency'].sum()).reset_index()
incomes_means['amount_n26_currency'] = incomes_means['amount_n26_currency']/5 # 5 months average
spending_means = (df[(df['transaction_month'] != 7) & (df['explanation_type'].isin(out_transactions))]
                  .groupby('user_id')['amount_n26_currency'].sum()).reset_index()
spending_means['amount_n26_currency'] = spending_means['amount_n26_currency']/5
user_df = user_df.merge(incomes_means, how='left', on='user_id').merge(spending_means, how='left', on='user_id')
user_df = user_df.rename(columns={'amount_n26_currency_x': 'y_mean_income',
                                  'amount_n26_currency_y': 'y_mean_expenses'})
user_df
| user_id | ATM_-1 | ATM_-2 | ATM_-3 | ATM_-4 | ATM_-5 | Automotive/Gas_-1 | Automotive/Gas_-2 | Automotive/Gas_-3 | Automotive/Gas_-4 | ... | Transport_-5 | Travel/Hotel_-1 | Travel/Hotel_-2 | Travel/Hotel_-3 | Travel/Hotel_-4 | Travel/Hotel_-5 | y_true_income | y_true_expenses | y_mean_income | y_mean_expenses | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 000295594379774ab9ac2c78c946d615 | 52.0 | 147.0 | NaN | 380.0 | 280.0 | NaN | NaN | NaN | NaN | ... | NaN | 4.0 | NaN | NaN | NaN | NaN | NaN | 9.0 | 188.4 | 178.6 |
| 1 | 000aa792d73dd82b16a29692772d395a | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | 155.0 | 106.0 | 132.0 | 120.0 |
| 2 | 0011103fe4ba1264342882b7ab98c641 | NaN | 25.0 | 48.0 | 27.0 | NaN | NaN | NaN | NaN | NaN | ... | NaN | 72.0 | NaN | 146.0 | NaN | NaN | 995.0 | 1008.0 | 217.6 | 76.8 |
| 3 | 001679c77c33d7efabf800596fb2a978 | 1025.0 | 706.0 | 309.0 | NaN | 220.0 | 17.0 | 15.0 | 37.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | 845.0 | 1292.0 | 909.4 | 879.0 |
| 4 | 001e72a2f4c4376f10011735dde39cd5 | NaN | 65.0 | NaN | 107.0 | NaN | NaN | NaN | NaN | 65.0 | ... | 12.0 | NaN | NaN | NaN | 38.0 | 39.0 | NaN | NaN | 85.2 | 120.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9995 | fff571276eaa49853039f6bfe1102fba | NaN | NaN | 11.0 | 11.0 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | 20.0 | 6.0 | 14.6 | 9.6 |
| 9996 | fff6ea4b3659a969c62c17c1d1655394 | NaN | NaN | NaN | 96.0 | 194.0 | NaN | 16.0 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 22.0 | NaN | 62.8 |
| 9997 | fff9eec15875baad3df433c8780aa3bd | 85.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | 10.0 | 21.0 | 33.0 | 28.0 | NaN | NaN | 7.0 | 181.6 | 183.4 |
| 9998 | fffd503ed2ecb63067b7369f4d668aed | 59.0 | NaN | 7.0 | 90.0 | 96.0 | 11.0 | 7.0 | 22.0 | NaN | ... | NaN | 23.0 | NaN | NaN | 32.0 | NaN | 272.0 | 320.0 | 214.0 | 99.2 |
| 9999 | fffde1ba10b4040deb651162f56c9fc4 | 114.0 | 249.0 | 264.0 | 279.0 | 22.0 | NaN | NaN | NaN | NaN | ... | NaN | NaN | 14.0 | NaN | NaN | NaN | 1487.0 | 1571.0 | 265.6 | 246.4 |
10000 rows × 130 columns
# Split into two modeling tables: keep only users that have the respective July target
df_income = user_df[~user_df['y_true_income'].isna()].drop('y_true_expenses', axis=1)
df_expenses = user_df[~user_df['y_true_expenses'].isna()].drop('y_true_income', axis=1)
# Save the preprocessing steps as a function
def preprocessing(raw_df:pd.DataFrame, transaction_types:pd.DataFrame=None,
                  mcc_group:pd.DataFrame=None, month_predict:str='2016-07',
                  feature_columns:list=None) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Summarizes the preprocessing steps performed above and returns two frames.

    Joins the lookup tables, builds per-user monthly totals per transaction type
    (Presentment is replaced by its mcc group explanation), pivots them into lag
    features relative to `month_predict`, then attaches the month-0 targets
    (`y_true_*`) and the 5-month-average baselines (`y_mean_*`).

    Args:
        raw_df: raw transactions with user_id, transaction_date (YYYY-MM-DD str),
            transaction_type, mcc_group and amount_n26_currency columns.
        transaction_types: lookup with 'type', 'explanation', 'direction' columns;
            defaults to the notebook-level table.
        mcc_group: lookup with 'mcc_group' and 'explanation'; defaults likewise.
        month_predict: target month in 'YYYY-MM' format.
        feature_columns: feature column names to keep after the pivot; defaults
            to the globally saved `features_names` from training.

    Returns:
        (df_income, df_expenses): users with a non-null income / expenses target.
    """
    # Late-bind the notebook-level defaults so the function can be defined (and
    # tested) without the lookup tables already loaded at module level
    if transaction_types is None:
        transaction_types = globals()['transaction_types']
    if mcc_group is None:
        mcc_group = globals()['mcc_group']
    # Join the lookups; both carry an 'explanation' column, hence the _x/_y suffixes
    raw_df = (raw_df.merge(transaction_types, how='left', left_on='transaction_type', right_on='type')
              .drop('type', axis=1)
              .merge(mcc_group, how='left', on='mcc_group'))
    raw_df.rename(columns = {'explanation_x': 'explanation_type',
                             'explanation_y': 'explanation_mcc'}, inplace=True)
    month_predict = pd.to_datetime(month_predict)  # first day of the target month
    # Truncate dates to the month and compute the (negative) lag in whole months.
    # Plain year/month arithmetic avoids the deprecated 'M' timedelta64 unit and is
    # exact here because dates are already floored to the first of the month.
    raw_df['transaction_date'] = pd.to_datetime(raw_df['transaction_date'].apply(lambda x: x[:7]))
    raw_df['transaction_month'] = raw_df['transaction_date'].apply(
        lambda x: (x.year - month_predict.year) * 12 + (x.month - month_predict.month))
    out_transactions = list(raw_df[raw_df['direction'] == 'Out'].explanation_type.unique())
    in_transactions = list(raw_df[raw_df['direction'] == 'In'].explanation_type.unique())
    agg_df = raw_df.copy()
    # Replace the coarse 'Presentment' type with its finer-grained mcc explanation
    agg_df.loc[agg_df['explanation_type'] == 'Presentment', 'explanation_type'] = (
        agg_df.loc[agg_df['explanation_type'] == 'Presentment', 'explanation_mcc'])
    agg_df = (agg_df.groupby(['user_id', 'transaction_month', 'explanation_type'])['amount_n26_currency']
              .sum()
              .reset_index())
    # Feature name = "<type>_<lag>", e.g. 'ATM_-1'
    agg_df['features'] = (agg_df['explanation_type'].apply(lambda x: x + '_') +
                          agg_df['transaction_month'].apply(lambda x: str(x)))
    # Pivot to one row per user; (user, month, type) is unique after the groupby,
    # so aggfunc='first' simply picks that single value
    agg_df = agg_df.pivot_table(values='amount_n26_currency', index='user_id',
                                columns='features', aggfunc='first').reset_index()
    # Keep the same lag features (up to 5 months back) as used during training.
    # BUGFIX: this previously referenced an undefined name `features`; the saved
    # global is `features_names`, now also overridable via `feature_columns`.
    if feature_columns is None:
        feature_columns = features_names
    agg_df = agg_df[['user_id'] + list(feature_columns)]
    # Add y_true: month-0 totals per user, split by transaction direction
    incomes = (raw_df[(raw_df['transaction_month'] == 0) &
                      (raw_df['explanation_type'].isin(in_transactions))]
               .groupby('user_id')['amount_n26_currency'].sum()).reset_index()
    spending = (raw_df[(raw_df['transaction_month'] == 0) &
                       (raw_df['explanation_type'].isin(out_transactions))]
                .groupby('user_id')['amount_n26_currency'].sum()).reset_index()
    agg_df = (agg_df.merge(incomes, how='left', on='user_id')
              .merge(spending, how='left', on='user_id'))
    agg_df.rename(columns={'amount_n26_currency_x': 'y_true_income',
                           'amount_n26_currency_y': 'y_true_expenses'}, inplace=True)
    # Baseline predictions: simple 5-month averages over the non-target months
    incomes_means = (raw_df[(raw_df['transaction_month'] != 0) &
                            (raw_df['explanation_type'].isin(in_transactions))]
                     .groupby('user_id')['amount_n26_currency'].sum()).reset_index()
    incomes_means['amount_n26_currency'] = incomes_means['amount_n26_currency']/5 # 5 months average
    spending_means = (raw_df[(raw_df['transaction_month'] != 0) &
                             (raw_df['explanation_type'].isin(out_transactions))]
                      .groupby('user_id')['amount_n26_currency'].sum()).reset_index()
    spending_means['amount_n26_currency'] = spending_means['amount_n26_currency']/5
    agg_df = agg_df.merge(incomes_means, how='left', on='user_id').merge(spending_means, how='left', on='user_id')
    agg_df.rename(columns={'amount_n26_currency_x': 'y_mean_income',
                           'amount_n26_currency_y': 'y_mean_expenses'}, inplace=True)
    # One frame per prediction problem: keep only users that have the respective target
    df_income = agg_df[~agg_df['y_true_income'].isna()].drop('y_true_expenses', axis=1)
    df_expenses = agg_df[~agg_df['y_true_expenses'].isna()].drop('y_true_income', axis=1)
    return df_income, df_expenses
We will try 2 regressors. Classic time-series models like ARIMA will be skipped for now due to time constraints, but keep them in mind for the future: they could have good predictive power, especially for total incomes, which may be a stationary series in many cases. The baseline is the simple average of incomes and expenses over the 5 previous months, as calculated before.
class Model:
    """Wraps an estimator class together with its fixed hyperparameters, the
    search bounds for the tunable ones, and the evaluation function used by a
    unified Bayesian optimization routine."""
    def __init__(self, model_object:object, fixed_params:dict, pbounds:dict, eval_func:object):
        self.model_object = model_object  # estimator class, e.g. xgb.XGBRegressor
        self.fixed_params = fixed_params  # hyperparameters kept constant during tuning
        self.pbounds = pbounds            # search space for the optimizer
        self.eval_func = eval_func        # black-box objective the optimizer maximizes
        self.best_params = None           # filled in by bayesian_optimizer()
    # Optimizer
    def bayesian_optimizer(self) -> dict:
        """Run Bayesian optimization over `pbounds`, store and return the best params."""
        opt = BayesianOptimization(
            f=self.eval_func,
            pbounds=self.pbounds,
            random_state=RANDOM_STATE
        )
        opt.maximize(init_points=10, n_iter=50)
        # Report the best target value and the hyperparameters that achieved it
        print(opt.max)
        self.best_params = opt.max['params']
        return self.best_params
    def get_final_params(self) -> dict:
        """Combine the optimized parameters with the fixed ones (fixed take precedence)."""
        return {**self.best_params, **self.fixed_params}
# Bayesian optimization eval functions
def xgb_evaluate(learning_rate:float, max_depth:int, gamma:float, min_child_weight:float, subsample:float,
                 colsample_bytree:float, reg_lambda:float, alpha:float) -> float:
    """Cross-validated objective for the Bayesian optimizer (XGBRegressor).

    NOTE(review): reads the notebook-level globals X_train/y_train. Returns the
    negated mean test RMSE, since the optimizer maximizes its target.
    """
    params = {
        "objective": "reg:squarederror",  # regression with squared loss
        "booster": "gbtree",              # booster fixed to save search time
        "eval_metric": "rmse",            # root mean square error as the chosen metric
        "learning_rate": learning_rate,
        "max_depth": int(max_depth),      # the optimizer proposes floats; xgboost needs an int
        "gamma": gamma,
        "min_child_weight": min_child_weight,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "reg_lambda": reg_lambda,
        "alpha": alpha,
        "random_state": RANDOM_STATE
    }
    train_matrix = xgb.DMatrix(X_train, y_train)
    cv_results = xgb.cv(
        params,
        train_matrix,
        num_boost_round=100,        # upper bound on rounds; early stopping cuts it short
        nfold=5,
        metrics=['rmse'],
        early_stopping_rounds=10,   # stop after 10 rounds without score improvement
        seed=RANDOM_STATE
    )
    # Last row corresponds to the early-stopped best round; negate for maximization
    return -1.0 * cv_results["test-rmse-mean"].iloc[-1]
def rf_evaluate(n_estimators:int, max_depth:int, min_samples_split:int, min_samples_leaf:int,
                max_leaf_nodes:int) -> float:
    """Cross-validated objective for the Bayesian optimizer (RandomForestRegressor).

    NOTE(review): reads the notebook-level globals X_train/y_train. Returns the
    negated mean fold RMSE, since the optimizer maximizes its target.
    """
    # The optimizer proposes floats; every one of these RF params must be an int
    forest = RandomForestRegressor(criterion='squared_error',
                                   n_estimators=int(n_estimators),
                                   max_depth=int(max_depth),
                                   min_samples_split=int(min_samples_split),
                                   min_samples_leaf=int(min_samples_leaf),
                                   max_leaf_nodes=int(max_leaf_nodes),
                                   random_state=RANDOM_STATE)
    # Manual 5-fold CV (shuffled) over the training arrays
    folds = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    fold_rmse = []
    for train_idx, test_idx in folds.split(X_train, y_train):
        forest.fit(X_train[train_idx], y_train[train_idx].ravel())
        fold_preds = forest.predict(X_train[test_idx])
        fold_rmse.append(np.sqrt(mean_squared_error(y_train[test_idx], fold_preds)))
    return -1.0 * np.array(fold_rmse).mean()  # negative mean RMSE for maximization
# Some helper functions for splits, shap and metrics print
def data_split(df:pd.DataFrame, col_exclude:list, col_target:list, val_set:bool=True) -> tuple:
    """Split `df` into train/test (and optionally validation) numpy arrays.

    Args:
        df: source frame containing features, target(s) and helper columns.
        col_exclude: columns dropped from the feature matrix (ids, targets, baselines).
        col_target: target column(s).
        val_set: when True also carve out a validation set (33% of the 30%
            hold-out, i.e. roughly 10% of all rows).

    Returns:
        (X, y, X_train, X_test, X_val, y_train, y_test, y_val) when val_set is True,
        otherwise (X, y, X_train, X_test, y_train, y_test).
        (The previous `-> np.ndarray` annotation was wrong: a tuple is returned.)
    """
    X, y = df.drop(col_exclude, axis=1).values, df[col_target].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)
    if val_set:
        # Second split: 33% of the 30% hold-out ≈ 10% of the total becomes validation
        X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.33,
                                                        random_state=RANDOM_STATE)
        return X, y, X_train, X_test, X_val, y_train, y_test, y_val
    else:
        return X, y, X_train, X_test, y_train, y_test
def plot_feature_importances(df:pd.DataFrame, X_train:np.ndarray, y_train:np.ndarray, X_test:np.ndarray,
                             col_exclude:list, model:xgb.XGBRegressor=None):
    """Train (if needed) an XGBoost model, then compute and plot SHAP importances.

    Args:
        df: frame the arrays were built from; used only to recover feature names.
        X_train, y_train: training arrays for the fallback model.
        X_test: rows to explain.
        col_exclude: columns that were dropped when building the arrays.
        model: an already-fitted regressor; when None, a default XGBRegressor is trained.
    """
    # BUGFIX: use `is None`, not truthiness — a falsy-but-valid model object
    # (e.g. one overriding __bool__/__len__) must not trigger retraining
    if model is None:
        model = xgb.XGBRegressor(random_state=RANDOM_STATE)
        model.fit(X_train, y_train)
    # Recover feature names so SHAP plots are labeled
    feature_names = list(df.drop(col_exclude, axis=1).columns)
    explained = pd.DataFrame(data=X_test, columns=feature_names)
    # Compute SHAP values over the test rows
    explainer = shap.Explainer(model, explained.values)
    shap_values = explainer(explained)
    shap.summary_plot(shap_values, X_test, max_display=10)
    # Waterfall of feature impacts for one sample customer; clamp the index so
    # small test sets (fewer than 34 rows) no longer raise IndexError
    sample_idx = min(33, len(shap_values) - 1)
    shap.plots.waterfall(shap_values[sample_idx], max_display=10)
def print_metrics(df:pd.DataFrame, preds:np.ndarray, preds_train:np.ndarray,
                  preds_test:np.ndarray, purpose:str='expenses') -> pd.DataFrame:
    """Print model and baseline (simple average) R2 and RMSE metrics.

    NOTE(review): relies on the module-level globals y, y_train, y_test set by
    the preceding data_split call — call it right after the matching split.

    Args:
        df: dataframe holding y_true_<purpose> and y_mean_<purpose> columns
            for the baseline comparison.
        preds / preds_train / preds_test: model predictions for the full,
            train and test sets respectively.
        purpose: 'expenses' or 'income'; selects the baseline columns.

    Returns:
        The metrics table (also printed), so callers may reuse it.
    """
    metrics = pd.DataFrame(index=['R2', 'RMSE'])
    metrics.loc['R2', 'Model - train'] = r2_score(y_train, preds_train)
    metrics.loc['R2', 'Model - test'] = r2_score(y_test, preds_test)
    metrics.loc['R2', 'Model - all'] = r2_score(y, preds)
    metrics.loc['RMSE', 'Model - train'] = np.sqrt(mean_squared_error(y_train, preds_train))
    metrics.loc['RMSE', 'Model - test'] = np.sqrt(mean_squared_error(y_test, preds_test))
    metrics.loc['RMSE', 'Model - all'] = np.sqrt(mean_squared_error(y, preds))
    # Baseline = per-customer simple average; compute the NaN-free frame once
    base = df.dropna(subset=f'y_mean_{purpose}')
    metrics.loc['R2', 'Baseline - all'] = r2_score(base[f'y_true_{purpose}'], base[f'y_mean_{purpose}'])
    metrics.loc['RMSE', 'Baseline - all'] = np.sqrt(mean_squared_error(base[f'y_true_{purpose}'],
                                                                       base[f'y_mean_{purpose}']))
    print(metrics)
    return metrics
# Split with validation set (validation is used for early monitoring of the final fit)
X, y, X_train, X_test, X_val, y_train, y_test, y_val = data_split(df_expenses, ['user_id', 'y_true_expenses',
                                                                  'y_mean_expenses', 'y_mean_income'],
                                                                  ['y_true_expenses'])
# Find the best params
# Search ranges explored by the Bayesian optimizer for XGBoost hyperparameters
xgb_pbounds = {
    "learning_rate": (0.01, 0.5),
    "max_depth": (2, 10),
    "gamma": (0, 1),
    "min_child_weight": (1, 20),
    "subsample": (0.5, 1),
    "colsample_bytree": (0.5, 1),
    "reg_lambda": (0, 1),
    "alpha": (0, 1)
}
# Parameters held constant during the search
fixed_params = {
    'eval_metric': 'rmse',
    'objective': 'reg:squarederror',
    'booster': 'gbtree',
    'random_state': RANDOM_STATE
}
# Model wrapper runs Bayesian optimization with the xgb_evaluate CV objective
model = Model(xgb.XGBRegressor, fixed_params, xgb_pbounds, eval_func=xgb_evaluate)
best_params = model.bayesian_optimizer()
params_xgb = model.get_final_params()
params_xgb['max_depth'] = int(params_xgb['max_depth']) # correct type: the optimizer proposes floats
| iter | target | alpha | colsam... | gamma | learni... | max_depth | min_ch... | reg_la... | subsample | ------------------------------------------------------------------------------------------------------------------------- | 1 | -354.0 | 0.3745 | 0.9754 | 0.732 | 0.3033 | 3.248 | 3.964 | 0.05808 | 0.9331 | | 2 | -386.5 | 0.6011 | 0.854 | 0.02058 | 0.4853 | 8.66 | 5.034 | 0.1818 | 0.5917 | | 3 | -359.0 | 0.3042 | 0.7624 | 0.4319 | 0.1527 | 6.895 | 3.65 | 0.2921 | 0.6832 | | 4 | -365.2 | 0.4561 | 0.8926 | 0.1997 | 0.262 | 6.739 | 1.883 | 0.6075 | 0.5853 | | 5 | -375.3 | 0.06505 | 0.9744 | 0.9656 | 0.4061 | 4.437 | 2.856 | 0.6842 | 0.7201 | | 6 | -372.0 | 0.122 | 0.7476 | 0.03439 | 0.4556 | 4.07 | 13.59 | 0.3117 | 0.76 | | 7 | -367.8 | 0.5467 | 0.5924 | 0.9696 | 0.3898 | 9.516 | 18.0 | 0.5979 | 0.9609 | | 8 | -355.8 | 0.08849 | 0.598 | 0.04523 | 0.1694 | 5.109 | 6.156 | 0.8287 | 0.6784 | | 9 | -363.4 | 0.2809 | 0.7713 | 0.1409 | 0.4031 | 2.596 | 19.75 | 0.7722 | 0.5994 | | 10 | -381.9 | 0.005522 | 0.9077 | 0.7069 | 0.3672 | 8.17 | 2.407 | 0.3585 | 0.5579 | | 11 | -363.2 | 0.1358 | 0.8675 | 0.2992 | 0.4343 | 2.435 | 19.82 | 0.7686 | 0.6422 | | 12 | -360.9 | 0.3814 | 0.8861 | 0.5137 | 0.2463 | 3.424 | 4.737 | 0.08982 | 0.9229 | | 13 | -358.7 | 0.5576 | 1.0 | 0.7742 | 0.2886 | 2.386 | 3.982 | 0.0 | 1.0 | | 14 | -414.2 | 1.0 | 1.0 | 0.05647 | 0.01 | 3.179 | 3.65 | 0.0 | 1.0 | | 15 | -352.5 | 0.2615 | 0.91 | 0.9794 | 0.08411 | 8.747 | 15.2 | 0.6517 | 0.9533 | | 16 | -369.8 | 0.03328 | 0.8489 | 0.9709 | 0.3834 | 3.088 | 4.315 | 0.05754 | 0.8945 | | 17 | -352.0 | 0.5245 | 0.936 | 0.8696 | 0.136 | 2.317 | 4.022 | 0.2708 | 0.911 | | 18 | -353.1 | 0.4754 | 1.0 | 1.0 | 0.2104 | 2.798 | 3.859 | 0.429 | 0.8703 | | 19 | -359.5 | 0.8615 | 0.8371 | 0.07548 | 0.264 | 4.163 | 13.84 | 0.4846 | 0.8102 | | 20 | -375.3 | 0.1351 | 0.5141 | 0.4813 | 0.365 | 9.224 | 3.852 | 0.8274 | 0.8302 | | 21 | -364.0 | 0.5553 | 0.7505 | 0.5572 | 0.361 | 2.513 | 4.21 | 0.687 | 0.9375 | | 22 | -359.5 
| 0.314 | 0.849 | 0.946 | 0.03939 | 2.447 | 3.431 | 0.333 | 0.7889 | | 23 | -385.8 | 0.959 | 0.5256 | 0.6285 | 0.4263 | 2.507 | 9.232 | 0.001175 | 0.5259 | | 24 | -381.3 | 0.5678 | 1.0 | 1.0 | 0.4752 | 2.289 | 3.884 | 0.4134 | 0.5 | | 25 | -379.9 | 0.6919 | 0.6025 | 0.7271 | 0.4339 | 2.562 | 2.827 | 0.1881 | 0.8 | | 26 | -358.7 | 0.7274 | 0.8142 | 0.9892 | 0.02725 | 3.068 | 4.104 | 0.08466 | 0.8902 | | 27 | -351.0 | 0.7241 | 0.6164 | 0.1373 | 0.07697 | 8.219 | 13.26 | 0.2427 | 0.7527 | | 28 | -368.9 | 0.7448 | 0.6291 | 0.2328 | 0.4731 | 2.183 | 12.07 | 0.6478 | 0.9433 | | 29 | -360.7 | 0.4184 | 0.6753 | 0.5531 | 0.2275 | 6.914 | 3.246 | 0.3205 | 0.9175 | | 30 | -352.9 | 0.2881 | 0.827 | 0.606 | 0.1277 | 7.023 | 3.694 | 0.374 | 0.7676 | | 31 | -351.1 | 0.6108 | 0.6883 | 0.3434 | 0.07868 | 8.348 | 13.74 | 0.3427 | 0.8018 | | 32 | -351.9 | 0.8687 | 0.76 | 0.2915 | 0.112 | 7.855 | 13.64 | 0.1206 | 0.949 | | 33 | -359.1 | 0.8427 | 0.559 | 0.3022 | 0.2473 | 8.005 | 13.49 | 0.7496 | 0.9507 | | 34 | -356.2 | 0.9562 | 0.8513 | 0.5632 | 0.172 | 8.28 | 13.49 | 0.0 | 0.506 | | 35 | -395.8 | 0.2961 | 0.9726 | 0.3688 | 0.01049 | 8.12 | 13.38 | 0.0284 | 1.0 | | 36 | -352.9 | 1.0 | 0.5279 | 0.1752 | 0.1056 | 8.196 | 13.67 | 0.2489 | 0.6854 | | 37 | -373.4 | 0.8701 | 0.6822 | 0.3967 | 0.4395 | 8.598 | 13.89 | 0.5318 | 0.9981 | | 38 | -356.6 | 0.6075 | 0.5813 | 0.3287 | 0.1834 | 8.476 | 13.59 | 0.4417 | 0.7922 | | 39 | -359.9 | 0.8584 | 0.5126 | 0.303 | 0.2813 | 5.62 | 18.04 | 0.5772 | 0.7155 | | 40 | -401.9 | 0.7343 | 0.6139 | 0.3689 | 0.01 | 7.985 | 13.58 | 0.3922 | 0.5 | | 41 | -353.9 | 0.8079 | 0.6396 | 0.196 | 0.1274 | 8.286 | 13.55 | 0.2246 | 0.8663 | | 42 | -353.9 | 1.0 | 0.637 | 0.1383 | 0.1449 | 8.05 | 13.46 | 0.1078 | 0.8701 | | 43 | -355.6 | 0.978 | 0.577 | 0.1339 | 0.1413 | 8.372 | 13.35 | 0.1236 | 0.6707 | | 44 | -363.3 | 0.8573 | 0.8481 | 0.1909 | 0.3313 | 4.189 | 13.81 | 0.5259 | 0.9063 | | 45 | -365.9 | 0.7399 | 0.5296 | 0.1714 | 0.2802 | 8.233 | 13.36 | 0.3519 | 
0.5756 | | 46 | -374.3 | 0.8294 | 0.7177 | 0.7969 | 0.3994 | 6.712 | 4.234 | 0.1533 | 0.991 | | 47 | -357.1 | 0.6325 | 0.9405 | 0.1911 | 0.1181 | 7.803 | 11.36 | 0.4727 | 0.5235 | | 48 | -358.5 | 0.4096 | 0.8848 | 0.4869 | 0.149 | 6.769 | 3.727 | 0.4459 | 0.7371 | | 49 | -354.6 | 0.4064 | 0.924 | 0.7971 | 0.1274 | 3.182 | 3.938 | 0.4572 | 0.6078 | | 50 | -366.6 | 0.01216 | 0.8834 | 0.3602 | 0.3462 | 2.696 | 6.088 | 0.5755 | 0.582 | | 51 | -351.7 | 0.8133 | 0.5394 | 0.336 | 0.1889 | 5.599 | 18.16 | 0.5642 | 0.7877 | | 52 | -354.5 | 0.7225 | 0.6323 | 0.6165 | 0.1663 | 5.888 | 18.32 | 0.475 | 0.9072 | | 53 | -352.9 | 1.0 | 0.6852 | 0.2653 | 0.1069 | 8.159 | 13.86 | 0.03118 | 0.909 | | 54 | -405.5 | 0.7113 | 0.5985 | 0.4038 | 0.01 | 5.54 | 18.45 | 0.5366 | 0.9514 | | 55 | -377.1 | 0.4 | 0.6863 | 0.05949 | 0.3918 | 9.837 | 14.67 | 0.9294 | 0.6715 | | 56 | -357.8 | 0.4658 | 0.8209 | 0.9371 | 0.2327 | 2.26 | 4.152 | 0.3091 | 0.8284 | | 57 | -385.7 | 0.6511 | 0.6043 | 0.7456 | 0.4161 | 6.717 | 1.386 | 0.3751 | 0.9958 | | 58 | -360.8 | 0.8414 | 0.8377 | 0.2614 | 0.2636 | 8.31 | 13.82 | 0.1855 | 0.7981 | | 59 | -356.0 | 0.8109 | 0.5742 | 0.519 | 0.2049 | 5.879 | 18.09 | 0.5101 | 0.7698 | | 60 | -361.9 | 0.9179 | 0.7985 | 0.03127 | 0.3133 | 4.543 | 17.72 | 0.2111 | 0.5009 | ========================================================================================================================= {'target': -350.956127699383, 'params': {'alpha': 0.7240513996586844, 'colsample_bytree': 0.6164469320520245, 'gamma': 0.1373491328719495, 'learning_rate': 0.07696879383066403, 'max_depth': 8.219439248069524, 'min_child_weight': 13.2625221437565, 'reg_lambda': 0.2426589832630276, 'subsample': 0.7526880386728095}}
# Train the model with the best params
model_xgb_expenses = xgb.XGBRegressor(**params_xgb)
# Validation set is passed for evaluation only; verbose=False suppresses the log
model_xgb_expenses.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
# Predictions on the full, train and test sets (consumed by print_metrics)
preds_xgb = model_xgb_expenses.predict(X)
preds_xgb_train = model_xgb_expenses.predict(X_train)
preds_xgb_test = model_xgb_expenses.predict(X_test)
# Save the model to use in the future
joblib.dump(model_xgb_expenses, 'model_xgb_expenses.joblib')
['model_xgb_expenses.joblib']
# Print performance metrics of the tuned XGBoost expenses model vs the simple-average baseline
print_metrics(df_expenses, preds_xgb, preds_xgb_train, preds_xgb_test, purpose='expenses')
Model - train Model - test Model - all Baseline - all R2 0.788920 0.560315 0.722985 0.457969 RMSE 223.038776 288.739278 244.698265 346.703241
Based on the overall R2 score, the XGBoost Regressor is a good-fit model to predict expenses. At the same time, we observe evidence of overfitting, which is acceptable for a test-purposes model and can be eliminated in the future, for example, by reducing the number of features or adding early-stopping rounds to the final model once we have a better-prepared dataset. The model performs 28% better than the simple average (baseline) in terms of RMSE and can be used as a first approximator to forecast customers' expenses.
# SHAP summary and waterfall plots for the tuned expenses model
plot_feature_importances(df_expenses, X_train, y_train, X_test,
                         ['user_id', 'y_true_expenses', 'y_mean_expenses', 'y_mean_income'], model_xgb_expenses)
Previous months' credit and debit transfers, as well as ATM withdrawals, have the most impact on the total expenses of the current month. Moreover, debit transfer is a highly important variable up to a 4-month lag. The decision to replace presentment with more detailed MCC groups was correct; as a result, we have the ATM, Travel, Gas, etc. categories among the most important.
Waterfall chart is presented to explain how particular features affected the final prediction for a randomly chosen customer. It shows cumulative effect of each feature starting from the expected output value E[f(X)].
# Feature columns: everything except user_id (first) and the three target/baseline columns (last)
features = list(df_expenses.columns)[1:-3]
# Impute NaN by replacing the missing values with median.
# It would be better to split customers into groups by expenses behavior and calculate medians for each of them,
# but let's simplify for now.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputer = imputer.fit(df_expenses[features])
df_expenses[features] = imputer.transform(df_expenses[features])
# Split without validation set (Random Forest is evaluated via KFold CV inside rf_evaluate)
X, y, X_train, X_test, y_train, y_test = data_split(df_expenses, ['user_id', 'y_true_expenses',
                                                    'y_mean_expenses', 'y_mean_income'],
                                                    ['y_true_expenses'], val_set=False)
# Find the best params
# Search ranges explored by the Bayesian optimizer for Random Forest hyperparameters
rf_pbounds = {
    "n_estimators": (10, 100),
    "max_depth": (1, 100),
    "min_samples_split": (2, 200),
    "min_samples_leaf": (1, 20),
    "max_leaf_nodes": (5, 50)
}
# Parameters held constant during the search
rf_fixed_params = {
    "criterion": 'squared_error',
    "random_state": RANDOM_STATE
}
model = Model(RandomForestRegressor, rf_fixed_params, rf_pbounds, eval_func=rf_evaluate)
best_params_rf = model.bayesian_optimizer()
params_rf = model.get_final_params()
# Correct types: the optimizer proposes floats, but these params must be ints
for i in ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'max_leaf_nodes']:
    params_rf[i] = int(params_rf[i])
| iter | target | max_depth | max_le... | min_sa... | min_sa... | n_esti... | ------------------------------------------------------------------------------------- | 1 | -370.4 | 38.08 | 47.78 | 14.91 | 120.5 | 24.04 | | 2 | -382.2 | 16.44 | 7.614 | 17.46 | 121.0 | 73.73 | | 3 | -380.2 | 3.038 | 48.65 | 16.82 | 44.04 | 26.36 | | 4 | -368.9 | 19.16 | 18.69 | 10.97 | 87.53 | 36.21 | | 5 | -370.3 | 61.57 | 11.28 | 6.551 | 74.54 | 51.05 | | 6 | -375.3 | 78.73 | 13.99 | 10.77 | 119.3 | 14.18 | | 7 | -376.8 | 61.15 | 12.67 | 2.236 | 189.9 | 96.91 | | 8 | -369.9 | 81.03 | 18.71 | 2.856 | 137.5 | 49.61 | | 9 | -374.5 | 13.08 | 27.28 | 1.653 | 182.0 | 33.29 | | 10 | -370.5 | 66.59 | 19.03 | 10.88 | 110.2 | 26.64 | | 11 | -369.4 | 15.82 | 15.18 | 9.903 | 86.12 | 36.0 | | 12 | -369.0 | 42.69 | 21.29 | 1.0 | 88.69 | 21.95 | | 13 | -364.8 | 61.5 | 45.46 | 1.0 | 98.26 | 47.15 | | 14 | -364.8 | 88.9 | 50.0 | 1.0 | 101.8 | 62.04 | | 15 | -371.1 | 87.7 | 50.0 | 20.0 | 78.72 | 44.28 | | 16 | -364.4 | 68.03 | 50.0 | 1.0 | 101.3 | 84.25 | | 17 | -365.6 | 100.0 | 50.0 | 1.0 | 118.6 | 100.0 | | 18 | -361.6 | 100.0 | 50.0 | 1.0 | 72.98 | 100.0 | | 19 | -356.7 | 100.0 | 50.0 | 1.0 | 29.61 | 100.0 | | 20 | -363.8 | 100.0 | 23.63 | 1.0 | 2.0 | 100.0 | | 21 | -360.9 | 99.27 | 49.1 | 1.072 | 67.1 | 98.26 | | 22 | -356.5 | 67.44 | 50.0 | 1.0 | 27.78 | 100.0 | | 23 | -366.9 | 74.02 | 50.0 | 20.0 | 2.0 | 100.0 | | 24 | -360.2 | 77.31 | 27.07 | 1.0 | 40.45 | 100.0 | | 25 | -358.2 | 47.22 | 50.0 | 1.0 | 51.2 | 100.0 | | 26 | -357.3 | 74.61 | 50.0 | 1.0 | 46.73 | 100.0 | | 27 | -430.7 | 1.0 | 50.0 | 1.0 | 2.0 | 100.0 | | 28 | -362.8 | 100.0 | 50.0 | 1.0 | 2.0 | 10.0 | | 29 | -377.8 | 100.0 | 50.0 | 1.0 | 200.0 | 10.0 | | 30 | -390.4 | 100.0 | 5.0 | 20.0 | 2.0 | 10.0 | | 31 | -356.7 | 100.0 | 50.0 | 1.0 | 13.29 | 60.93 | | 32 | -428.7 | 1.0 | 50.0 | 20.0 | 200.0 | 100.0 | | 33 | -399.0 | 47.32 | 5.0 | 1.0 | 200.0 | 10.0 | | 34 | -395.2 | 100.0 | 5.0 | 1.0 | 108.7 | 100.0 | | 35 | -378.0 | 100.0 | 
50.0 | 20.0 | 200.0 | 100.0 | | 36 | -358.5 | 64.87 | 50.0 | 1.0 | 36.61 | 58.93 | | 37 | -428.6 | 1.0 | 5.0 | 20.0 | 134.8 | 10.0 | | 38 | -363.9 | 100.0 | 50.0 | 1.0 | 49.86 | 10.0 | | 39 | -362.8 | 51.9 | 50.0 | 1.0 | 2.0 | 10.0 | | 40 | -370.6 | 61.71 | 50.0 | 1.0 | 158.7 | 51.93 | | 41 | -429.4 | 1.0 | 5.0 | 1.0 | 2.0 | 10.0 | | 42 | -405.4 | 2.861 | 50.0 | 1.0 | 92.32 | 100.0 | | 43 | -363.4 | 54.88 | 50.0 | 1.0 | 47.09 | 10.0 | | 44 | -373.0 | 100.0 | 50.0 | 1.0 | 128.7 | 10.0 | | 45 | -398.0 | 100.0 | 5.0 | 1.0 | 200.0 | 67.67 | | 46 | -367.3 | 74.98 | 50.0 | 20.0 | 11.64 | 33.4 | | 47 | -374.1 | 100.0 | 50.0 | 20.0 | 142.1 | 65.26 | | 48 | -388.8 | 42.93 | 5.0 | 20.0 | 55.25 | 100.0 | | 49 | -429.4 | 1.0 | 50.0 | 1.0 | 200.0 | 10.0 | | 50 | -397.4 | 30.99 | 5.0 | 1.0 | 169.5 | 60.77 | | 51 | -365.8 | 100.0 | 19.25 | 1.0 | 36.72 | 62.84 | | 52 | -366.8 | 100.0 | 50.0 | 20.0 | 35.05 | 77.05 | | 53 | -395.9 | 100.0 | 5.0 | 1.0 | 71.98 | 10.0 | | 54 | -361.1 | 40.74 | 50.0 | 1.0 | 63.77 | 57.91 | | 55 | -368.7 | 57.25 | 50.0 | 20.0 | 62.29 | 81.47 | | 56 | -362.2 | 75.98 | 29.75 | 1.0 | 2.0 | 66.83 | | 57 | -369.0 | 74.05 | 50.0 | 1.0 | 153.9 | 100.0 | | 58 | -356.7 | 100.0 | 50.0 | 1.0 | 2.0 | 87.43 | | 59 | -374.2 | 49.14 | 50.0 | 20.0 | 80.68 | 10.0 | | 60 | -359.1 | 100.0 | 50.0 | 1.0 | 47.72 | 48.97 | ===================================================================================== {'target': -356.54002319740357, 'params': {'max_depth': 67.43593423463622, 'max_leaf_nodes': 50.0, 'min_samples_leaf': 1.0, 'min_samples_split': 27.78479143652681, 'n_estimators': 100.0}}
# Train the model with the best params
model_rf_expenses = RandomForestRegressor(**params_rf)
# .ravel() flattens the (n, 1) target to the 1-D shape RandomForestRegressor expects
model_rf_expenses.fit(X_train, y_train.ravel())
# Predictions on the full, train and test sets (consumed by print_metrics)
preds_rf = model_rf_expenses.predict(X)
preds_rf_train = model_rf_expenses.predict(X_train)
preds_rf_test = model_rf_expenses.predict(X_test)
# Save the model to use in the future
joblib.dump(model_rf_expenses, 'model_rf_expenses.joblib')
['model_rf_expenses.joblib']
print_metrics(df_expenses, preds_rf, preds_rf_train, preds_rf_test, purpose='expenses')
Model - train Model - test Model - all Baseline - all R2 0.734899 0.445998 0.666886 0.457969 RMSE 249.955097 306.950239 268.333865 346.703241
The Random Forest model has worse overall performance results than the XGBoost trained before. The model is still slightly better than the baseline; however, we will use XGBoost as the main one to solve the task and make predictions for August.
# Split with validation set — same procedure as for expenses, now on the income frame
X, y, X_train, X_test, X_val, y_train, y_test, y_val = data_split(df_income, ['user_id', 'y_true_income',
                                                                  'y_mean_expenses', 'y_mean_income'],
                                                                  ['y_true_income'])
# Find the best params
# Search ranges explored by the Bayesian optimizer for XGBoost hyperparameters
xgb_pbounds = {
    "learning_rate": (0.01, 0.5),
    "max_depth": (2, 10),
    "gamma": (0, 1),
    "min_child_weight": (1, 20),
    "subsample": (0.5, 1),
    "colsample_bytree": (0.5, 1),
    "reg_lambda": (0, 1),
    "alpha": (0, 1)
}
# Parameters held constant during the search
fixed_params = {
    'eval_metric': 'rmse',
    'objective': 'reg:squarederror',
    'booster': 'gbtree',
    'random_state': RANDOM_STATE
}
model = Model(xgb.XGBRegressor, fixed_params, xgb_pbounds, eval_func=xgb_evaluate)
best_params = model.bayesian_optimizer()
params_xgb = model.get_final_params()
params_xgb['max_depth'] = int(params_xgb['max_depth']) # correct type: the optimizer proposes floats
| iter | target | alpha | colsam... | gamma | learni... | max_depth | min_ch... | reg_la... | subsample | ------------------------------------------------------------------------------------------------------------------------- | 1 | -294.8 | 0.3745 | 0.9754 | 0.732 | 0.3033 | 3.248 | 3.964 | 0.05808 | 0.9331 | | 2 | -326.3 | 0.6011 | 0.854 | 0.02058 | 0.4853 | 8.66 | 5.034 | 0.1818 | 0.5917 | | 3 | -297.8 | 0.3042 | 0.7624 | 0.4319 | 0.1527 | 6.895 | 3.65 | 0.2921 | 0.6832 | | 4 | -307.7 | 0.4561 | 0.8926 | 0.1997 | 0.262 | 6.739 | 1.883 | 0.6075 | 0.5853 | | 5 | -303.3 | 0.06505 | 0.9744 | 0.9656 | 0.4061 | 4.437 | 2.856 | 0.6842 | 0.7201 | | 6 | -301.2 | 0.122 | 0.7476 | 0.03439 | 0.4556 | 4.07 | 13.59 | 0.3117 | 0.76 | | 7 | -304.4 | 0.5467 | 0.5924 | 0.9696 | 0.3898 | 9.516 | 18.0 | 0.5979 | 0.9609 | | 8 | -297.4 | 0.08849 | 0.598 | 0.04523 | 0.1694 | 5.109 | 6.156 | 0.8287 | 0.6784 | | 9 | -297.4 | 0.2809 | 0.7713 | 0.1409 | 0.4031 | 2.596 | 19.75 | 0.7722 | 0.5994 | | 10 | -317.5 | 0.005522 | 0.9077 | 0.7069 | 0.3672 | 8.17 | 2.407 | 0.3585 | 0.5579 | | 11 | -291.9 | 0.2451 | 0.7864 | 0.6724 | 0.211 | 3.129 | 3.715 | 0.06695 | 0.9198 | | 12 | -334.2 | 0.0 | 0.5 | 0.0 | 0.01 | 2.285 | 3.913 | 0.1194 | 0.7334 | | 13 | -298.2 | 0.1363 | 0.8368 | 0.5112 | 0.02286 | 2.444 | 5.712 | 0.9713 | 0.8948 | | 14 | -293.2 | 0.2721 | 0.8334 | 0.9435 | 0.2669 | 3.525 | 3.455 | 0.1006 | 0.9739 | | 15 | -295.0 | 0.2615 | 0.91 | 0.9794 | 0.08411 | 8.747 | 15.2 | 0.6517 | 0.9533 | | 16 | -291.4 | 0.8367 | 0.5858 | 1.0 | 0.1309 | 3.137 | 3.516 | 0.0 | 1.0 | | 17 | -297.5 | 0.791 | 1.0 | 0.5508 | 0.5 | 3.177 | 3.105 | 0.0 | 1.0 | | 18 | -297.2 | 0.9429 | 0.772 | 0.9706 | 0.3278 | 3.328 | 3.635 | 0.8261 | 0.8655 | | 19 | -331.1 | 0.7972 | 0.5 | 0.4391 | 0.01 | 3.814 | 3.691 | 0.0 | 1.0 | | 20 | -313.2 | 0.1351 | 0.5141 | 0.4813 | 0.365 | 9.224 | 3.852 | 0.8274 | 0.8302 | | 21 | -295.1 | 0.5135 | 0.9094 | 1.0 | 0.3692 | 2.978 | 3.482 | 0.2547 | 0.9803 | | 22 | -294.6 | 0.1311 | 
0.7924 | 0.7565 | 0.198 | 3.357 | 3.672 | 0.2115 | 0.8011 | | 23 | -303.7 | 0.959 | 0.5256 | 0.6285 | 0.4263 | 2.507 | 9.232 | 0.001175 | 0.5259 | | 24 | -330.7 | 0.3195 | 0.5595 | 1.0 | 0.01 | 3.034 | 3.055 | 0.0 | 1.0 | | 25 | -296.4 | 0.6919 | 0.6025 | 0.7271 | 0.4339 | 2.562 | 2.827 | 0.1881 | 0.8 | | 26 | -297.0 | 0.6109 | 0.7273 | 0.9666 | 0.3785 | 3.269 | 3.704 | 0.1171 | 0.852 | | 27 | -295.4 | 0.7241 | 0.6164 | 0.1373 | 0.07697 | 8.219 | 13.26 | 0.2427 | 0.7527 | | 28 | -298.7 | 0.7448 | 0.6291 | 0.2328 | 0.4731 | 2.183 | 12.07 | 0.6478 | 0.9433 | | 29 | -299.2 | 0.4184 | 0.6753 | 0.5531 | 0.2275 | 6.914 | 3.246 | 0.3205 | 0.9175 | | 30 | -296.4 | 0.6577 | 0.6879 | 0.2365 | 0.4589 | 2.223 | 11.98 | 0.6358 | 0.9017 | | 31 | -301.3 | 0.9379 | 0.7944 | 0.7099 | 0.3707 | 3.108 | 3.423 | 0.4202 | 0.6029 | | 32 | -291.2 | 0.5397 | 0.905 | 0.8363 | 0.06349 | 2.963 | 3.865 | 0.4651 | 0.9291 | | 33 | -292.4 | 0.05155 | 0.9848 | 0.8828 | 0.1489 | 3.038 | 3.96 | 0.8841 | 0.9696 | | 34 | -294.2 | 0.4826 | 0.8595 | 0.8447 | 0.3337 | 2.779 | 3.722 | 0.8328 | 0.6835 | | 35 | -295.4 | 0.248 | 1.0 | 0.6637 | 0.4877 | 3.141 | 3.713 | 0.5835 | 1.0 | | 36 | -295.7 | 0.1609 | 0.5989 | 0.9034 | 0.3718 | 3.062 | 4.36 | 0.8426 | 0.7708 | | 37 | -293.7 | 0.7426 | 0.6303 | 0.9041 | 0.214 | 2.803 | 4.183 | 0.4425 | 0.8168 | | 38 | -293.1 | 0.8512 | 0.844 | 0.6074 | 0.1145 | 2.905 | 3.971 | 0.8687 | 0.9788 | | 39 | -299.4 | 0.8584 | 0.5126 | 0.303 | 0.2813 | 5.62 | 18.04 | 0.5772 | 0.7155 | | 40 | -291.9 | 0.533 | 1.0 | 1.0 | 0.07208 | 2.74 | 4.278 | 0.9302 | 1.0 | | 41 | -330.8 | 0.5539 | 1.0 | 1.0 | 0.01 | 3.365 | 4.187 | 1.0 | 1.0 | | 42 | -291.8 | 0.693 | 0.8705 | 0.5348 | 0.1384 | 2.594 | 4.255 | 0.6766 | 0.6863 | | 43 | -293.9 | 0.3064 | 0.9143 | 0.8019 | 0.2675 | 2.618 | 4.104 | 0.6597 | 1.0 | | 44 | -292.7 | 0.8238 | 0.9792 | 0.9166 | 0.0464 | 2.483 | 3.975 | 0.7079 | 0.9282 | | 45 | -303.7 | 0.7399 | 0.5296 | 0.1714 | 0.2802 | 8.233 | 13.36 | 0.3519 | 0.5756 | | 46 | -301.6 
| 0.8294 | 0.7177 | 0.7969 | 0.3994 | 6.712 | 4.234 | 0.1533 | 0.991 | | 47 | -296.1 | 0.6325 | 0.9405 | 0.1911 | 0.1181 | 7.803 | 11.36 | 0.4727 | 0.5235 | | 48 | -293.6 | 0.7792 | 0.6667 | 0.7286 | 0.135 | 2.563 | 4.26 | 0.9429 | 0.6074 | | 49 | -296.8 | 0.8112 | 0.97 | 0.7758 | 0.4817 | 2.559 | 4.277 | 0.8965 | 0.9885 | | 50 | -297.4 | 0.01216 | 0.8834 | 0.3602 | 0.3462 | 2.696 | 6.088 | 0.5755 | 0.582 | | 51 | -299.1 | 0.8133 | 0.5394 | 0.336 | 0.1889 | 5.599 | 18.16 | 0.5642 | 0.7877 | | 52 | -292.3 | 0.05786 | 0.6563 | 0.9951 | 0.2364 | 2.463 | 3.966 | 0.8673 | 0.6266 | | 53 | -292.5 | 1.0 | 0.8489 | 0.9921 | 0.09955 | 2.788 | 3.777 | 0.05048 | 1.0 | | 54 | -334.5 | 0.251 | 1.0 | 0.9996 | 0.01 | 2.43 | 4.356 | 0.9814 | 0.6526 | | 55 | -309.6 | 0.4 | 0.6863 | 0.05949 | 0.3918 | 9.837 | 14.67 | 0.9294 | 0.6715 | | 56 | -291.6 | 0.7723 | 0.7009 | 0.9389 | 0.07267 | 2.891 | 4.139 | 0.4051 | 0.848 | | 57 | -306.3 | 0.6511 | 0.6043 | 0.7456 | 0.4161 | 6.717 | 1.386 | 0.3751 | 0.9958 | | 58 | -294.0 | 0.8112 | 0.6407 | 0.8737 | 0.15 | 2.736 | 3.739 | 0.5234 | 1.0 | | 59 | -291.4 | 0.2674 | 0.5771 | 0.903 | 0.1185 | 2.956 | 3.742 | 0.5306 | 0.9445 | | 60 | -299.8 | 0.9179 | 0.7985 | 0.03127 | 0.3133 | 4.543 | 17.72 | 0.2111 | 0.5009 | ========================================================================================================================= {'target': -291.21863099733775, 'params': {'alpha': 0.5397396119280753, 'colsample_bytree': 0.9049698974993343, 'gamma': 0.836309973758812, 'learning_rate': 0.06348754651296912, 'max_depth': 2.962591823399781, 'min_child_weight': 3.8649056740873413, 'reg_lambda': 0.4651277468079176, 'subsample': 0.9291450258728853}}
# Train the model with the best params
model_xgb_income = xgb.XGBRegressor(**params_xgb)
# Validation set is passed for evaluation only; verbose=False suppresses the log
model_xgb_income.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
# Predictions on the full, train and test sets (consumed by print_metrics)
preds_xgb = model_xgb_income.predict(X)
preds_xgb_train = model_xgb_income.predict(X_train)
preds_xgb_test = model_xgb_income.predict(X_test)
# Save the model to use in the future
joblib.dump(model_xgb_income, 'model_xgb_income.joblib')
['model_xgb_income.joblib']
# Print performance metrics of the tuned XGBoost income model vs the simple-average baseline
print_metrics(df_income, preds_xgb, preds_xgb_train, preds_xgb_test, purpose='income')
Model - train Model - test Model - all Baseline - all R2 0.485858 0.409088 0.453465 0.264807 RMSE 263.793550 306.281841 275.742296 323.849975
The model has a low R2, which shows that the XGBoost regression does not explain the variance in y_true well and may not be a good fit. Nevertheless, it is still 17% better than the baseline in terms of RMSE.
# SHAP summary and waterfall plots for the tuned income model
plot_feature_importances(df_income, X_train, y_train, X_test,
                         ['user_id', 'y_true_income', 'y_mean_expenses', 'y_mean_income'], model_xgb_income)
Previous 4 months credit transfers are the most important features to predict incomes. The higher the credit transfer value the higher predicted income. It's expected as credit transfer reflects the salary which has usually the highest share in total incomes and stable over time.
Waterfall chart is presented to explain how particular features affected the final prediction for a randomly chosen customer. It shows cumulative effect of each feature starting from the expected output value E[f(X)].
# Feature columns: everything except user_id (first) and the three target/baseline columns (last)
features = list(df_income.columns)[1:-3]
# Impute NaN by replacing the missing values with median.
# It would be better to split customers into groups by income behavior and calculate medians for each of them,
# but let's simplify for now.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputer = imputer.fit(df_income[features])
df_income[features] = imputer.transform(df_income[features])
# Split without validation set (Random Forest is evaluated via KFold CV inside rf_evaluate)
X, y, X_train, X_test, y_train, y_test = data_split(df_income, ['user_id', 'y_true_income',
                                                    'y_mean_expenses', 'y_mean_income'],
                                                    ['y_true_income'], val_set=False)
# Find the best params
# Search ranges explored by the Bayesian optimizer for Random Forest hyperparameters
rf_pbounds = {
    "n_estimators": (10, 100),
    "max_depth": (1, 100),
    "min_samples_split": (2, 200),
    "min_samples_leaf": (1, 20),
    "max_leaf_nodes": (5, 50)
}
# Parameters held constant during the search
rf_fixed_params = {
    "criterion": 'squared_error',
    "random_state": RANDOM_STATE
}
model = Model(RandomForestRegressor, rf_fixed_params, rf_pbounds, eval_func=rf_evaluate)
best_params_rf = model.bayesian_optimizer()
params_rf = model.get_final_params()
# Correct types: the optimizer proposes floats, but these params must be ints
for i in ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'max_leaf_nodes']:
    params_rf[i] = int(params_rf[i])
| iter | target | max_depth | max_le... | min_sa... | min_sa... | n_esti... | ------------------------------------------------------------------------------------- | 1 | -296.0 | 38.08 | 47.78 | 14.91 | 120.5 | 24.04 | | 2 | -300.3 | 16.44 | 7.614 | 17.46 | 121.0 | 73.73 | | 3 | -300.2 | 3.038 | 48.65 | 16.82 | 44.04 | 26.36 | | 4 | -295.4 | 19.16 | 18.69 | 10.97 | 87.53 | 36.21 | | 5 | -295.9 | 61.57 | 11.28 | 6.551 | 74.54 | 51.05 | | 6 | -296.8 | 78.73 | 13.99 | 10.77 | 119.3 | 14.18 | | 7 | -295.2 | 61.15 | 12.67 | 2.236 | 189.9 | 96.91 | | 8 | -294.7 | 81.03 | 18.71 | 2.856 | 137.5 | 49.61 | | 9 | -296.1 | 13.08 | 27.28 | 1.653 | 182.0 | 33.29 | | 10 | -295.7 | 66.59 | 19.03 | 10.88 | 110.2 | 26.64 | | 11 | -295.3 | 77.3 | 30.53 | 1.0 | 115.5 | 55.08 | | 12 | -297.1 | 58.87 | 8.633 | 4.073 | 190.8 | 99.52 | | 13 | -299.9 | 58.82 | 7.15 | 1.68 | 22.7 | 30.99 | | 14 | -295.5 | 47.39 | 49.69 | 18.52 | 87.5 | 38.4 | | 15 | -295.6 | 80.95 | 12.91 | 7.018 | 55.23 | 75.33 | | 16 | -295.9 | 49.64 | 16.51 | 14.97 | 120.0 | 81.82 | | 17 | -295.5 | 8.471 | 41.54 | 9.867 | 62.55 | 36.89 | | 18 | -295.3 | 9.168 | 45.67 | 12.28 | 150.5 | 77.33 | | 19 | -295.7 | 73.09 | 35.04 | 19.9 | 82.5 | 58.49 | | 20 | -294.5 | 14.23 | 38.18 | 6.726 | 39.53 | 30.42 | | 21 | -293.7 | 27.87 | 34.32 | 2.181 | 149.9 | 96.33 | | 22 | -296.5 | 17.22 | 9.549 | 5.521 | 182.6 | 81.57 | | 23 | -296.5 | 77.97 | 21.09 | 16.91 | 187.1 | 24.68 | | 24 | -295.0 | 51.06 | 28.02 | 16.04 | 67.42 | 93.02 | | 25 | -294.8 | 86.52 | 30.75 | 8.909 | 38.28 | 35.33 | | 26 | -293.9 | 62.86 | 39.34 | 8.287 | 39.61 | 85.38 | | 27 | -299.2 | 57.05 | 7.347 | 7.919 | 34.32 | 23.94 | | 28 | -296.1 | 54.07 | 41.07 | 16.69 | 110.8 | 59.16 | | 29 | -297.7 | 7.795 | 8.115 | 3.912 | 68.96 | 43.59 | | 30 | -296.8 | 44.21 | 11.8 | 11.99 | 145.5 | 22.23 | | 31 | -296.4 | 24.35 | 33.09 | 17.16 | 170.7 | 82.97 | | 32 | -296.1 | 43.73 | 42.58 | 12.95 | 27.74 | 12.59 | | 33 | -294.0 | 98.43 | 21.87 | 4.917 | 149.3 | 70.71 | | 34 | 
-295.9 | 65.41 | 43.32 | 8.151 | 192.5 | 28.86 | | 35 | -296.3 | 46.88 | 39.26 | 14.85 | 180.6 | 19.45 | | 36 | -293.8 | 29.56 | 48.71 | 4.751 | 191.8 | 97.86 | | 37 | -301.8 | 77.71 | 5.463 | 5.154 | 102.5 | 43.06 | | 38 | -300.4 | 62.91 | 6.254 | 10.92 | 122.9 | 99.82 | | 39 | -295.4 | 61.71 | 11.07 | 3.419 | 187.9 | 97.34 | | 40 | -293.5 | 21.64 | 38.54 | 4.918 | 149.5 | 90.5 | | 41 | -294.5 | 23.98 | 43.5 | 1.0 | 156.6 | 95.99 | | 42 | -295.3 | 29.36 | 37.09 | 13.49 | 152.3 | 94.4 | | 43 | -294.5 | 16.71 | 36.57 | 1.0 | 145.0 | 99.1 | | 44 | -294.2 | 19.25 | 29.01 | 1.0 | 154.7 | 89.79 | | 45 | -294.7 | 28.51 | 35.54 | 1.0 | 143.0 | 83.93 | | 46 | -294.3 | 90.33 | 28.01 | 2.757 | 139.6 | 63.41 | | 47 | -294.5 | 87.63 | 18.24 | 2.343 | 153.9 | 60.11 | | 48 | -294.2 | 69.46 | 22.82 | 1.0 | 182.2 | 89.01 | | 49 | -294.3 | 45.4 | 41.9 | 1.0 | 188.9 | 95.94 | | 50 | -294.1 | 60.53 | 38.02 | 11.88 | 54.15 | 76.35 | | 51 | -295.1 | 56.31 | 27.6 | 20.0 | 43.45 | 85.1 | | 52 | -293.2 | 67.69 | 41.11 | 4.994 | 55.21 | 92.99 | | 53 | -294.3 | 53.65 | 47.13 | 1.0 | 52.74 | 90.61 | | 54 | -293.5 | 75.62 | 50.0 | 4.393 | 50.56 | 82.47 | | 55 | -294.7 | 68.85 | 50.0 | 19.1 | 51.81 | 91.94 | | 56 | -293.9 | 80.6 | 37.12 | 1.0 | 44.41 | 93.71 | | 57 | -293.9 | 76.86 | 49.59 | 2.863 | 68.02 | 91.19 | | 58 | -294.4 | 75.87 | 35.98 | 1.0 | 61.31 | 80.61 | | 59 | -294.7 | 89.37 | 50.0 | 1.0 | 55.41 | 100.0 | | 60 | -294.6 | 68.78 | 50.0 | 1.0 | 39.14 | 100.0 | ===================================================================================== {'target': -293.19696809368804, 'params': {'max_depth': 67.69096436063744, 'max_leaf_nodes': 41.105943819266024, 'min_samples_leaf': 4.993577461489025, 'min_samples_split': 55.214391665570396, 'n_estimators': 92.98846851625616}}
# Train the model with the best params
model_rf_income = RandomForestRegressor(**params_rf)
# .ravel() flattens the (n, 1) target to the 1-D shape RandomForestRegressor expects
model_rf_income.fit(X_train, y_train.ravel())
# Predictions on the full, train and test sets (consumed by print_metrics)
preds_rf = model_rf_income.predict(X)
preds_rf_train = model_rf_income.predict(X_train)
preds_rf_test = model_rf_income.predict(X_test)
# Save the model to use in the future
joblib.dump(model_rf_income, 'model_rf_income.joblib')
['model_rf_income.joblib']
print_metrics(df_income, preds_rf, preds_rf_train, preds_rf_test, purpose='income')
Model - train Model - test Model - all Baseline - all R2 0.531038 0.374497 0.481231 0.264807 RMSE 251.936692 304.086512 268.646700 323.849975
Random Forest showed slightly better results compared to XGBoost's performance in terms of RMSE, but worse in terms of generalization (higher overfitting); it may also not be a perfect-fit model for the given data and has room for improvement. However, it performs 20% better than the baseline, and both models can be used as good first-iteration predictors. For the current test task's purposes, let's use XGBoost to predict expenses and Random Forest to predict income for August.
def make_predictions(raw_df: pd.DataFrame, models_list: list = None,
                     transaction_types: pd.DataFrame = transaction_types, mcc_group: pd.DataFrame = mcc_group,
                     features: list = features_names, purpose: str = 'expenses', month_predict: str = '2016-08') -> tuple:
    """
    Preprocess ``raw_df`` and predict per-customer expenses or income for ``month_predict``.

    Parameters
    ----------
    raw_df : pd.DataFrame
        Raw transactions in the same format as 2016-09-19_79351_training.csv.
    models_list : list, optional
        [expenses_model, income_model]; defaults to the trained
        ``model_xgb_expenses`` and ``model_rf_income``.
    transaction_types, mcc_group : pd.DataFrame
        Lookup tables used by ``preprocessing``.
    features : list
        Feature column names fed to the models.
    purpose : str
        Either ``'expenses'`` or ``'income'``.
    month_predict : str
        Target month in ``'YYYY-MM'`` format.

    Returns
    -------
    tuple
        ``(preds, metrics)`` — model predictions (np.ndarray) and a DataFrame
        with R2/RMSE for the model and the simple-average baseline.

    Raises
    ------
    ValueError
        If ``purpose`` is not ``'expenses'`` or ``'income'``.
    """
    # Avoid a mutable default argument: resolve the trained models lazily.
    if models_list is None:
        models_list = [model_xgb_expenses, model_rf_income]
    # Preprocessing. Reuse the function defined earlier in the notebook.
    df_income, df_expenses = preprocessing(raw_df, transaction_types, mcc_group, month_predict)
    # Make predictions for the requested target.
    if purpose == 'expenses':
        preds = models_list[0].predict(df_expenses[features])
        df = df_expenses.copy()
        y = df_expenses['y_true_expenses']
    elif purpose == 'income':
        # The income features contain NaN -> impute with the median before predicting.
        # NOTE(review): the imputer is re-fitted on the prediction data; for a real
        # deployment, serialize the imputer fitted on training data and reuse it
        # here to avoid train/serve skew.
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        imputer = imputer.fit(df_income[features])
        df_income[features] = imputer.transform(df_income[features])
        preds = models_list[1].predict(df_income[features])
        df = df_income.copy()
        y = df_income['y_true_income']
    else:
        # Fail fast instead of hitting a NameError on `preds`/`df`/`y` below.
        raise ValueError(f"purpose must be 'expenses' or 'income', got {purpose!r}")
    # Baseline comparison uses only rows where the historical mean is available.
    baseline = df.dropna(subset=[f'y_mean_{purpose}'])
    y_base_true = baseline[f'y_true_{purpose}']
    y_base_pred = baseline[f'y_mean_{purpose}']
    # Assemble the metrics table (model vs. simple-average baseline).
    metrics = pd.DataFrame(index=['R2', 'RMSE'])
    metrics.loc['R2', 'Model - all'] = r2_score(y, preds)
    metrics.loc['RMSE', 'Model - all'] = np.sqrt(mean_squared_error(y, preds))
    metrics.loc['R2', 'Baseline - all'] = r2_score(y_base_true, y_base_pred)
    metrics.loc['RMSE', 'Baseline - all'] = np.sqrt(mean_squared_error(y_base_true, y_base_pred))
    return preds, metrics
# Sanity check: replay model development on July data (month_predict='2016-07')
# and confirm the pipeline reproduces the metrics seen earlier.
test_df = pd.read_csv('2016-09-19_79351_training.csv')
preds, metrics = make_predictions(
    test_df, purpose='expenses', month_predict='2016-07',
)
metrics
| Model - all | Baseline - all | |
|---|---|---|
| R2 | 0.722985 | 0.457969 |
| RMSE | 244.698265 | 346.703241 |
# Repeat the sanity check for the income model on the same July data.
preds, metrics = make_predictions(
    test_df, purpose='income', month_predict='2016-07',
)
metrics
| Model - all | Baseline - all | |
|---|---|---|
| R2 | 0.481231 | 0.264807 |
| RMSE | 268.646700 | 323.849975 |
Same results as during model development — everything works fine :)
The idea is to deploy the model as a microservice app. To do that we need, first of all, to fix all dependencies and packages using, for example, a Pipfile from pipenv, and not forget to serialize all necessary objects (models, tables, scalers, etc.). Then do the following steps:
For example, it can be an in-app message: "Based on your income/expenses history, your next month's income is expected to be higher (lower) than your expenses." or similar. The data can also be used for internal purposes, for example, customers' creditworthiness assessment, which will affect decisions on credit product applications. 5. Don't forget to test the endpoint by sending test requests.
The models are useful for the initial purposes. The XGBoost and RandomForest models can be used to predict expenses/income per customer given 5 previous months of historical transaction data. Moreover, the outcomes can be used not only as an in-app feature for customers but also as one of the variables to predict a customer's credit score, which will be useful for making automated credit decisions or for any other underwriting purposes.
Both models perform up to 28% better than the baseline (simple average) in terms of RMSE; however, the income model has a relatively low R2 score, which may be evidence of a poor fit. So, we can try other specifications in the future, like classic time-series models (e.g. ARMA) or NNs (LSTM). The data itself can also be prepared in a better way — for example, splitting the customers into cohorts and imputing NaN by calculating the mean (median) within each group, or adding some new features (e.g. the income/expenses ratio). This will require more time and can be done in future work.
Currently we used the R2 score (which shows the proportion of the variance in target y explained by independent variables X) to evaluate the goodness of the model's fit, and RMSE, which is the square root of the average squared differences between predictions and actuals. It's commonly used for financial data. However, a couple more metrics can be added: